Winter Olympics Medals over Time

Data

The main data is provided in “winter.csv”, containing the following variables on all medal winners in all winter olympics from 1924 to 2014:

  • Year: year of the winter olympics
  • City: city the olympics is held
  • Sport: the type of sport
  • Discipline: a grouping of disciplines
  • Event: the particular event / competition
  • Athlete: name of the athlete
  • Country: country origin of the athlete
  • Gender: gender of the athlete
  • Medal: type of medal won

For example, an event is a competition in a sport or discipline that gives rise to a ranking. Thus, skiing is a sport, while cross-country skiing, Alpine skiing, snowboarding, ski jumping and Nordic combined are disciplines. Alpine skiing is a discipline, while the super-G, giant slalom, slalom and combined are events.

In addition, information about the countries is available in a separate spreadsheet “dictionary.csv”, including the IOC Country Code, Population, and GDP per capita.

Visualizations

Read in data
olymp <- read.csv("../data/winter.csv")
gdp <- read.csv("../data/dictionary.csv")

olymp$Year <- as.Date(strptime(olymp$Year, format = "%Y"))
colnames(gdp) <- c("Country_Full", "Country", "Population", "GDP.per.Capita")

1. Medal Counts over Time

Combine the information in the two spreadsheets winter.csv and dictionary.csv. Because some countries that competed under different designations in the past (e.g. Germany and Russia), I chose to combine Russia with URS, Germany with EUA, FRG and GDR, and Czech Republic with TCH.

A summary of how many winter games each country medaled in, and how many medals of each type the country won can be seen below.

# How many winter game Medal types per country
# Combine RUS and URS, and GER, EUA, FRG, GDR
olymp$Country <- as.character(olymp$Country)
olymp$Country <- ifelse(olymp$Country == "URS", "RUS", olymp$Country) # Russia
olymp$Country <- ifelse(olymp$Country == "EUA", "GER", olymp$Country) # Germany
olymp$Country <- ifelse(olymp$Country == "FRG", "GER", olymp$Country)
olymp$Country <- ifelse(olymp$Country == "GDR", "GER", olymp$Country)
olymp$Country <- ifelse(olymp$Country == "TCH", "CZE", olymp$Country) # Czech
olymp$Country <- as.factor(olymp$Country)

descriptive1 <- olymp %>% 
  mutate(Medal = factor(Medal, levels = c("Gold", "Silver", "Bronze"))) %>%
  group_by(Country, Medal) %>% 
  mutate(medal_yr_cnt = length(Medal)) %>%
  select(Country, Medal, medal_yr_cnt) %>%
  unique() %>% 
  spread(key = Medal, value = medal_yr_cnt) %>% 
  ungroup() %>% 
  select(Country, Gold, Silver, Bronze)
descriptive1[is.na(descriptive1)] <- 0

# How many years in Olympics
descriptive2 <- olymp %>% 
  select(Country, Year) %>% 
  group_by(Country) %>% 
  unique() %>% 
  summarize(num_olymp = n())

# Combination -- Number of Olympics and Medals per Country
descriptive <- inner_join(descriptive1, descriptive2, by = "Country")
descriptive$total_medals <- rowSums(descriptive[, c(2,3,4)])

# Joining GDP information for later
descriptive_gdp <- inner_join(descriptive, gdp, by = "Country")
head(descriptive)
## # A tibble: 6 x 6
##   Country   Gold Silver Bronze num_olymp total_medals
##   <fct>    <dbl>  <dbl>  <dbl>     <int>        <dbl>
## 1 AUS       5.00   3.00   7.00         6        15.0 
## 2 AUT      79.0   98.0  103           22       280   
## 3 BEL       2.00   4.00   7.00         4        13.0 
## 4 BLR       6.00   4.00   5.00         6        15.0 
## 5 BUL       1.00   2.00   3.00         4         6.00
## 6 CAN     315    203    107           22       625

Additional Data Wrangling

# Who are the top 10 medal producers over time?
top10 <- descriptive %>% 
  arrange(desc(total_medals)) %>% 
  mutate(rank = row_number()) %>% 
  filter(rank <= 10)
head(top10)
## # A tibble: 6 x 7
##   Country  Gold Silver Bronze num_olymp total_medals  rank
##   <fct>   <dbl>  <dbl>  <dbl>     <int>        <dbl> <int>
## 1 RUS     344      187    172        15          703     1
## 2 USA     167      319    167        22          653     2
## 3 GER     226      208    203        20          637     3
## 4 CAN     315      203    107        22          625     4
## 5 NOR     159      171    127        22          457     5
## 6 FIN      66.0    147    221        22          434     6
# Calculate how many medals per year 
medalsyear <- olymp %>% 
  group_by(Year, Country) %>% 
  filter(Country %in% top10$Country) %>% 
  summarize(medal_yr_cnt = length(Medal)) 

# Split by gold, silver, bronze medals, per year
medals_year <- olymp %>% 
  mutate(Medal = factor(Medal, levels = c("Gold", "Silver", "Bronze"))) %>%
  group_by(Year, Country, Medal) %>% 
  summarize(medal_yr_cnt = length(Medal)) 
head(medals_year)
## # A tibble: 6 x 4
## # Groups:   Year, Country [4]
##   Year       Country Medal  medal_yr_cnt
##   <date>     <fct>   <fct>         <int>
## 1 1924-02-18 AUT     Gold              3
## 2 1924-02-18 AUT     Silver            1
## 3 1924-02-18 BEL     Bronze            5
## 4 1924-02-18 CAN     Gold              9
## 5 1924-02-18 FIN     Gold              4
## 6 1924-02-18 FIN     Silver            8
# filter for only countries that have at least 100 medals
medals100 <- olymp %>% 
  group_by(Country) %>% 
  summarize(total_medals = length(Medal)) %>% 
  filter(total_medals >= 100)
head(medals100)
## # A tibble: 6 x 2
##   Country total_medals
##   <fct>          <int>
## 1 AUT              280
## 2 CAN              625
## 3 CZE              233
## 4 FIN              434
## 5 FRA              152
## 6 GER              637
Olympic Medals Over Time

I propose to use the following visualization for a look at the medals won by the top 10 medal-winning countries over time. I think the plot is clear in that it uses familiar colors (gold, silver, bronze) and doesn’t totally overwhelm the reader with information. While I actually liked my second plot (below) better, I can’t seem to run the code any longer without a strange error that I’ll fix within the week.

# Split by Country, over time 
#png("../fig/medcountry_time.png")
medcountry <- medals_year %>%  
  filter(Country %in% top10$Country) %>% 
  ggplot(aes(x = Year, y = medal_yr_cnt, na.rm = TRUE)) +
  geom_jitter(aes(color = Medal)) +
  scale_color_manual(values=c("gold","grey73","darkgoldenrod4")) +
  labs(x = "", y = "# Medals Won") +
  facet_wrap(~Country) +
  theme_light() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  theme(strip.background = element_rect(fill = "lightskyblue2")) 
medcountry

The below plot worked for many days until it didn’t anymore. I would love to send this one to the editor, but need to diagnose what went wrong first.

# The winning "over time" plot
# medals_year %>%
#   ggplot(aes(x = Year, y = medal_yr_cnt, na.rm = TRUE)) +
#   geom_area(aes( fill = Medal), alpha = 0.8) +
#   scale_fill_manual(values=c("gold","grey73","darkgoldenrod4")) +
#   labs(x = "Year", y = "Medals Won",
#        title = "Olympic Medals by Country",
#        subtitle = "Winter Olympics 1924 - 2014") +
#   facet_wrap(~Country) +
#   theme_light() +
#   theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
#   theme(strip.background = element_rect(fill = "lightskyblue2"))
Medals Over Time

Medals Over Time

Total Medal Visualizations

To visualize the total amount of medals won by the top 10 medal-winning countries, I produced a lollipop chart for a quick glance, then a bar chart with the same gold, silver and bronze coloring to add more information and make the stacks immediately obvious to the reader.

library(ggalt)
top10 %>% 
  ggplot(aes(x = reorder(Country, total_medals), y = total_medals)) +
  geom_lollipop(point.colour = "lightblue", point.size = 2) +
  labs(x = "", y = "Total Medals", 
       title = "Winter Olympics", 
       subtitle = "Winter Olympics 1924 - 2014: Top 10 Medal-Winning Countries") +
  coord_flip() +
  theme_minimal()

# Viz for top medal producing countries, at least 100 medals 
totalmedals_plot <- olymp %>% 
  mutate(Medal = factor(Medal, levels = c("Gold", "Silver", "Bronze"))) %>%
  group_by(Country, Medal) %>% 
  filter(Country %in% medals100$Country) %>% 
  summarize(total_medals = length(Medal))

#png("../fig/totalmedals.png")
totalmedals <- totalmedals_plot %>% 
  ggplot(aes(x = reorder(Country, total_medals), y = total_medals, 
             fill = Medal, group = Medal)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = total_medals), position = position_stack(vjust = 0.5),
            size = 3, color = "white") +
  scale_fill_manual(values=c("gold","grey73","darkgoldenrod4")) +
  labs(x = "", y = "", 
       title = "Olympic Medals Overall", 
       subtitle = "Winter Olympics 1924 - 2014: Countries with Over 100 Medals") +
  theme_light()
totalmedals

2. Medal Counts adjusted by Population, GDP

I created three separate rankings of success by GDP per capita, population, and total number of medals won. I then chose to visualize this data in a few different ways. First, I visualized the percentage of medals won divided by the population of each country. Countries are ordered by GDP/Capita ranking, thus, countries with high GDP/Capita and low percentage of medals/population are highlighted in yellow. LUX, for instance, has the highest GDP/Capita yet a very low percentage of medals/population. SUI and NOR’s success makes sense, given they have high GDP/Capita and high percentage of medals/population. The USA, however, could do better!

# Data prep
by_pop_gdp <- descriptive_gdp %>% 
  arrange(desc(total_medals)) %>% 
  mutate(medal_rank = row_number()) %>% 
  arrange(desc(GDP.per.Capita), desc(total_medals)) %>% 
  mutate(gdp_medal_rank = row_number()) %>% 
  arrange(desc(Population), desc(total_medals)) %>% 
  mutate(pop_medal_rank = row_number()) %>% 
  filter(medal_rank <10 | gdp_medal_rank <10 |pop_medal_rank <10)
head(by_pop_gdp)
## # A tibble: 6 x 12
##   Country  Gold Silver Bronze num_olymp total_medals Country_Full 
##   <chr>   <dbl>  <dbl>  <dbl>     <int>        <dbl> <fct>        
## 1 CHN      16.0   30.0   36.0         7         82.0 China        
## 2 USA     167    319    167          22        653   United States
## 3 RUS     344    187    172          15        703   Russia       
## 4 JPN      17.0   22.0   24.0        12         63.0 Japan        
## 5 GER     226    208    203          20        637   Germany      
## 6 FRA      36.0   35.0   81.0        21        152   France       
## # ... with 5 more variables: Population <int>, GDP.per.Capita <dbl>,
## #   medal_rank <int>, gdp_medal_rank <int>, pop_medal_rank <int>
# GDP Viz
# by_pop_gdp %>% 
#   ggplot(aes(x = reorder(Country, gdp_medal_rank), y = total_medals)) +
#   geom_bar(aes(fill = total_medals < 20), stat = "identity") +
#   scale_fill_manual(values=c("grey","gold")) +
#   labs(x = "Country in Order of GDP/Capita Ranking", y = "Medals Won Overall", 
#        title = "Some Countries Should Be Doing Better", 
#        subtitle = "Winter Olympic Medals by Countries in Order of Highest GDP/Capita") +
#   theme_light() +
#   theme(legend.position = "none")

# GDP Viz
by_pop_gdp %>% 
  ggplot(aes(x = reorder(Country, gdp_medal_rank), y = (total_medals/Population * 100))) +
  geom_bar(aes(fill = (total_medals/Population* 100) < 0.0025), stat = "identity") +
  scale_fill_manual(values=c("grey","gold")) +
  labs(x = "Country in Order of GDP/Capita Ranking", y = "% of Medal Winners/Pop", 
       title = "Some Countries Should Be Doing Better", 
       subtitle = "Winter Olympic Medals/Population of Countries") +
  theme_light() +
  theme(legend.position = "none") 

The below viz shows the countries lined up by population size and compares how many medals they’ve won in total. Countries with the fewest medals are “shamed” by being highlighted in yellow.

# Pop Viz 
by_pop_gdp %>% 
  ggplot(aes(x = reorder(Country, pop_medal_rank), y = total_medals)) +
  geom_bar(aes(fill = total_medals < Population/500000), stat = "identity") +
  scale_fill_manual(values=c("grey","gold")) +
  labs(x = "Country in Order of Largest Population", y = "Medals Won Overall", 
       title = "Some Countries Should Be Doing Better", 
       subtitle = "Countries that have enough people to win more medals") +
  theme_light() +
  theme(legend.position = "none")

I adjusted the viz such that we can see the countries with the highest GDP/Capita and largest populations are visualized with how many medals they’ve won. In this case, larger bubbles mean more overall medals won. I’d argue that it doesn’t make a ton of sense to compare GDP/Capita to Population, so I’d scrap this.

by_pop_gdp %>% 
  ggplot(aes(x = log(Population), y = log(GDP.per.Capita))) +
  geom_count(aes(color = Country, size = total_medals, fill = Country, alpha = 0.7), show.legend = FALSE) +
  scale_size_area(max_size = 20) +
  geom_text(aes(label = Country), check_overlap = TRUE, size = 3) +
  theme_light() +
  theme(axis.text = element_blank()) +
  theme(axis.ticks = element_blank()) +
  labs(x = expression(atop("Population", paste(symbol('\256')))), 
       y = expression(atop("GDP/Capita",paste(symbol('\256')))), 
       title = "Some Countries Should Be Doing Better", 
       subtitle = "Winter Olympics 1924 - 2014: Medals per Pop & GDP of Country",
       caption = "Bigger Bubbles = More Medals Won") +
  theme(axis.title.y = element_text(size = 12)) + 
  theme(axis.title.x = element_text(size = 12)) 

The following viz points out China, specifically, as a country with both a lot of people and a lot of money, but we can see from the size of its bubble that China is not a very big medal producer. They could also be doing better!

by_pop_gdp %>% 
  ggplot(aes(x = log(Population), y = log(GDP.per.Capita * Population))) +
  geom_count(aes(color = Country, size = total_medals, fill = Country, alpha = 0.7), show.legend = FALSE) +
  scale_size_area(max_size = 20) +
  geom_text(aes(label = Country), check_overlap = TRUE, size = 3) +
  theme_light() +
  theme(axis.text = element_blank()) +
  theme(axis.ticks = element_blank()) +
  labs(x = expression(atop("Population", paste(symbol('\256')))), 
       y = expression(atop("GDP",paste(symbol('\256')))), 
       title = "China Should Be Doing Better", 
       subtitle = "Winter Olympics 1924 - 2014: Medals per Pop & GDP of Country",
       caption = "Bigger Bubbles = More Medals Won") +
  theme(axis.title.y = element_text(size = 12)) + 
  theme(axis.title.x = element_text(size = 12)) 

This final visualization is the second one I would recommend to my editor. I feel the x and y measurements make sense with each other, and the bubbles are fairly aesthetically pleasing. It’s clear that Norway and Finland do well for themselves given the large amount of medals they’ve won despite their smaller populations.

#png("../fig/norwaymedals.png")
norway <- by_pop_gdp %>% 
  ggplot(aes(x = log(Population), y = total_medals/Population *100)) +
  geom_count(aes(color = Country, size = total_medals, 
                 fill = Country, alpha = 0.7), show.legend = FALSE) +
  #scale_fill_manual(values=c("purple","yellow")) +
  scale_size_area(max_size = 20) +
  geom_text(aes(label = Country), check_overlap = TRUE, size = 3) +
  theme_classic() +
  theme(axis.text = element_blank()) +
  theme(axis.ticks = element_blank()) +
  labs(x = expression(atop("Population", paste(symbol('\256')))), 
       y = expression(atop("% of Medals per Pop",paste(symbol('\256')))), 
       title = "Norway & Finland Citizens Dominate", 
       subtitle = "Winter Olympics 1924 - 2014: Percentage of Medals Won per Population",
       caption = "Bigger Bubbles = More Medals Won Overall") +
  theme(axis.title.y = element_text(size = 12)) + 
  theme(axis.title.x = element_text(size = 12)) 
  #geom_encircle(data = subset(by_pop_gdp, (total_medals/Population *100 > 0.0040)))
norway

3. Host Country Advantage

I calculated whether the countries won more medals when they hosted Winter Olympics as opposed to when they were outside participants. To do so, I downloaded necessary country host information from Wikipedia below.

library(rvest)
library(stringr)
wiki_hosts <- read_html("https://en.wikipedia.org/wiki/Winter_Olympic_Games")
hosts <- html_table(html_nodes(wiki_hosts, "table")[[5]], fill=TRUE)
hosts <- hosts[-1,1:3]
hosts$city <- str_split_fixed(hosts$Host, n=2, ",")[,1]
hosts$country <- str_split_fixed(hosts$Host, n=2, ",")[,2]
hosts$Year <- as.Date(strptime(hosts$Year, format = "%Y"))
hosts$Year <- format(hosts$Year, '%Y')
medals_year$Year <- format(medals_year$Year, '%Y') # weird issues with year 

hosts <- hosts %>% 
  select(Year, country)
colnames(hosts)[colnames(hosts) == "country"] <- "Host_Country"

# Join tables for Full Country Name, Year, number of medals won and whether they hosted
hosts <- merge(medals_year, hosts, on = "Year")
hosts <- merge(hosts, descriptive_gdp, on = "Country")
hosts$Host_Country <- as.character(hosts$Host_Country)
hosts$Country_Full <- as.character(hosts$Country_Full)
hosts$Host_Country <- trimws(hosts$Host_Country)
hosts <- hosts %>% 
  select(Year, Country, Country_Full, Medal, medal_yr_cnt, Host_Country) %>% 
  mutate(hosted = ifelse(Country_Full == Host_Country, 1, 0)) %>% 
  spread(key = Medal, value = medal_yr_cnt)
hosts[is.na(hosts)] <- 0 
hosts$total_medals <- rowSums(hosts[, c(6,7,8)])

I began with a wrapped bar chart for each country that has ever hosted in the Olympics. I chose stark colors (grey and blue) so the reader can easily see a country’s total number of medals won when they were hosts versus when they were not hosts. It’s clear from the visualization that most countries do exceedingly better when they host.

#png("../fig/hostplot1.png")
hosts %>% 
  filter(Country_Full %in% hosts$Host_Country) %>% 
  ggplot(aes(x = Year, y = total_medals)) +
  geom_histogram(aes(fill = as.factor(hosted)), stat = "identity") +
  facet_wrap(~Country_Full, nrow = 4) +
  theme_minimal() +
  theme(axis.text = element_blank()) +
  theme(axis.ticks = element_blank()) +
  theme(strip.background = element_rect(fill = "lightskyblue2")) +
  scale_fill_manual(values=c("gray87","lightseagreen"), labels = c("Medals; Not Hosting", "Host")) +
  labs(x = "1924 - 2014", y = "", 
       title = "Home Court Advantage", 
       subtitle = "Host Countries of the Winter Olympics 1924 - 2014",
       fill = "Medals Won Per Year") +
  theme(text = element_text(size = 11))

A downside of the next plot I created is that it’s slightly harder to compare the amount of medals won each year, but I find it particularly aesthetically pleasing. This plot includes only countries that have hosted an Olympics and shows how many medals each country won at each winter Olympics. Tiles outlined in black indicate that the country hosted that year.

#png("../fig/hostplot2.png")
palette <- colorRampPalette(brewer.pal(9, 'GnBu'), space='Lab', bias = 5)
hosts$hosted <- as.factor(hosts$hosted)
home.court <- hosts %>% 
  filter(Country_Full %in% hosts$Host_Country) %>% 
  ggplot(aes(x = Year, y = Country)) +
  geom_raster(aes(fill = total_medals)) +
  geom_tile(aes(color = hosted), fill = "#00000000", size = 1, show.legend = FALSE) +
  theme_light() +
  scale_fill_gradientn(colors = palette(9)) +
  scale_color_manual(values = c('#00000000', 'black')) +
  labs(x="", y="", fill="Medals", 
       title = "Home Court Advantage", 
       subtitle = "Host Countries of the Winter Olympics 1924 - 2014", 
       caption = "Hosting country indicated by outline. ") +
  theme(
        legend.text=element_text(color="grey20"),
        axis.text.x=element_text(size=8),
        axis.ticks.y=element_blank(),
        axis.ticks.x = element_blank(),
        panel.grid=element_blank(),
        plot.margin = unit(c(.5,1,0.3,1), "cm")) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
  home.court

4. Country success by sport / discipline / event

First, I wanted to compare USA and Canadian Olympic performance especially because of their hockey rivalry. I adjusted the dataset to account for the fact that hockey is a large team sport and calculated the total number of medals as just one per win rather than per player. I visualized the number of medals won per sport between the countries two different ways. I’d suggest to my editor to scrap the first – I don’t find it as visually appealing as the second.

# calculate the large team sport, hockey, as just one medal per team rather than person
sports_per_game <- olymp %>%
  group_by(Country, Year, Sport, Medal) %>%
  select(Country, Year, Sport, Medal) %>%
  unique() %>%
  summarize(num = n()) %>%
  group_by(Country, Sport, Medal) %>%
  mutate(num_olymp_game = sum(num)) %>%
  ungroup() %>%
  select(Country, Sport, Medal, num_olymp_game) %>%
  unique()

sports <- olymp %>% 
  mutate(Medal = factor(Medal, levels = c("Gold", "Silver", "Bronze"))) %>% 
  group_by(Year, Country, Sport) %>% 
  group_by(Country, Sport, Medal) %>% 
  mutate(medal_sport_cnt = as.integer(length(Medal))) %>% 
  select(Country, Sport, Medal, medal_sport_cnt) %>% 
  unique() %>% 
  merge(sports_per_game, on = "Country") %>% 
  mutate(adj_medals = ifelse(Sport == "Ice Hockey", num_olymp_game, medal_sport_cnt))

# Canada vs. USA Medals
sports %>% 
  filter(Country == "USA" | Country == "CAN") %>% 
  group_by(Country, Sport) %>% 
  summarize(total_medals = sum(adj_medals)) %>% 
  ggplot(aes(x = Sport, y = total_medals)) +
  geom_bar(aes(color = Country, fill = Country), stat = "identity", alpha = 0.8) + 
  coord_flip() +
  labs(x= "",
       y= " ",
       title = "U.S. vs. Canadian Olympic Medals", 
       subtitle = "Winter Olympics 1924 - 2014") +
  theme_light()

I’d recommend using the following visualization to compare Canada and USA’s Olympic performance by sport (discipline, to be exact). USA’s advantage over Canada is clear. To improve this plot, I’d add an interactive overlay to indicate the precise amount of medals per sport layer.

sports %>% 
  filter(Country == "USA" | Country == "CAN") %>% 
  group_by(Country, Sport) %>% 
  summarize(total_medals = sum(adj_medals)) %>% 
  ggplot(aes(x = Country, y = total_medals)) +
  geom_bar(aes(fill = Sport), stat = "identity", alpha = 0.9) + 
  scale_fill_brewer(palette = "Reds") +
  labs(x= "",
       y= " ",
       title = "U.S. vs. Canadian Olympic Medals", 
       subtitle = "Winter Olympics 1924 - 2014") +
  theme_light()

To best visualize the top 10 medal-winning countries, I’d suggest using the visualization below. It provides the reader with even more information but in a (hopefully) digestible way – the reader can tell how many of each medal type each country won overall in each discipline. The top ten countries happen to be best at Skiing and Skating sports as indicated by the darker blues.

# Top 10 Medal Winning Countries Split by Sport
#png("../fig/sports.png")
palette <- colorRampPalette(brewer.pal(9, 'GnBu'), space='Lab', bias = 5)
sports <- sports %>% 
  filter(Country %in% top10$Country) %>% 
  ggplot(aes(x = Sport, y = Medal, fill = adj_medals)) + 
  geom_tile(color = "white") + 
  facet_grid(~Country) + 
  scale_fill_gradientn(colors = palette(9)) +
  theme_light() +
  coord_flip() +
  theme(strip.background = element_rect(fill = "lightskyblue2")) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  theme(panel.grid=element_blank()) +
  labs(x= "",
       y= "Medal Types",
       title = "Top 10 Medal Producing Countries by Sport", 
       subtitle="Winter Olympics 1924 - 2014", 
       fill="# of Medals")
sports

5. Most successful athletes

Below is a visual display of the most successful winter Olympic athletes of all time. Norway tends to produce the top winning winter athletes, which is not surprising given Norway’s clear success in prior plots.

athletes <- olymp %>% 
  mutate(Medal = factor(Medal, levels = c("Gold", "Silver", "Bronze"))) %>%
  group_by(Athlete, Gender, Medal) %>% 
  select(Country, Athlete, Discipline, Sport) %>%
  mutate(total_medals_per = length(Medal)) %>%
  ungroup()  %>% 
  unique()
athletes <- athletes %>% 
  group_by(Athlete, Gender, Country) %>% 
  mutate(totalmedals = sum(total_medals_per)) %>% 
  select(Athlete, Gender, Country, Discipline, Sport, totalmedals) %>% 
  unique() 
athletes$Discipline <- as.factor(athletes$Discipline)
athletes$Athlete <- as.factor(athletes$Athlete) 
athletes$totalmedals <- as.integer(athletes$totalmedals)

# top 20 overall
top_20 <- athletes %>% 
  select(Athlete, Gender, totalmedals, Sport, Discipline) %>% 
  unique() %>% 
  arrange(desc(totalmedals)) 
top_20 <- top_20[-10, ] # ridding of second country, looked up most represented country
top_20 <- top_20[1:20, ] 
top_20$Discipline <- as.character(top_20$Discipline)
top_20[3, 6] <- "Multiple Sports"
top_20[4, 6] <- "Multiple Sports"
top_20 <- unique(top_20)

# top 10 female
top_10_F <- athletes %>% 
  filter(Gender == "Women") %>% 
  arrange(desc(totalmedals))
top_10_F <- top_10_F[-5, ] 
top_10_F <- top_10_F[1:10, ]

# top 10 men
top_10_M <- athletes %>% 
  filter(Gender == "Men") %>% 
  arrange(desc(totalmedals))
top_10_M$Discipline <- as.character(top_10_M$Discipline)
top_10_M[3, 4] <- "Multiple Sports"
top_10_M[4, 4] <- "Multiple Sports"
top_10_M <- unique(top_10_M)
top_10_M <- top_10_M[1:10, ]

#png("../fig/athletes.png")
athletes <- top_20 %>% 
  ggplot(aes(reorder(x = Athlete, totalmedals), y = totalmedals)) +
  geom_bar(aes(fill = Country == "NOR"), alpha = 0.6, stat = "identity") +
  scale_fill_manual(values=c("grey","red"), labels = c("Other Country", "Norway")) +
  coord_flip() +
  theme_light() +
  labs(x= "",
       y= "Overall Medals",
       title = "Top 20 Medal-Winning Olympians: Norway Dominates
       ", 
       subtitle = "Winter Olympics 1924 - 2014") +
  theme(legend.title = element_blank())
athletes

I then split the athletes by gender. From the plots, we can see that women dominate in Cross Country Skiing while men dominate in the Biathlon.

#png("../fig/top10F.png")
 top10F <- top_10_F %>% 
  ggplot(aes(reorder(x = Athlete, totalmedals), y = totalmedals)) +
  geom_bar(aes(fill = Discipline), alpha = 0.7, stat = "identity") +
  coord_flip() +
  theme_light() +
  theme(legend.position = "bottom") +
  labs(x= "",
       y= "Total Medals",
       title = "Top 10 Female Medal-Winning Olympians", 
       subtitle = "Winter Olympics 1924 - 2014") +
  scale_y_continuous(breaks = pretty_breaks()) +
  scale_fill_brewer(palette = "Dark2", name = "Sport")
 top10F

#png("../fig/top10M.png")
  top10M <- top_10_M %>% 
  ggplot(aes(reorder(x = Athlete, totalmedals), y = totalmedals)) +
  geom_bar(aes(fill = Discipline), alpha = 0.7, stat = "identity") +
  coord_flip() +
  theme_light() +
  theme(legend.position = "bottom") +
  labs(x= "",
       y= "Total Medals",
       title = "Top 10 Male Medal-Winning Olympians", 
       subtitle = "Winter Olympics 1924 - 2014") +
  scale_y_continuous(breaks = pretty_breaks()) +
  scale_fill_brewer(palette = "Dark2", name = "Sport")
 top10M

 #dev.off()

#source("../lib/multiplot.R") 
# layout <- matrix(c(1, 1, 2, 2, 1, 1, 2, 2), 2, 4, byrow = TRUE)
# multiplot(top10F, top10M, layout = layout) 

Interactivity

6. Medals per Population

For my first interactive plot, I chose to add additional information to my bubble chart that represented the number of medals per population of each country. Prior to adding interactivity, the scale for total medals won was unclear. The below plot allows the user to hover over each bubble and get a clearer picture of how many overall medals the country won on a scale .

library(plotly)
# Had to rearrange the ggplot to get rid of arrow element 
colnames(by_pop_gdp)[colnames(by_pop_gdp) == "total_medals"] <- "Total_Medals"
colnames(by_pop_gdp)[colnames(by_pop_gdp) == "Country"] <- "Code"
colnames(by_pop_gdp)[colnames(by_pop_gdp) == "Country_Full"] <- "Country"
norway2 <- by_pop_gdp %>% 
  mutate(medspop = Total_Medals/Population * 100) %>% 
  ggplot(aes(x = log(Population), y = Total_Medals/Population *100)) +
  geom_count(aes(color = Code, size = Total_Medals, 
                 fill = Code, alpha = 0.7, label = Country), show.legend = FALSE) +
   labs(x = "Population", 
       y = "% of Medals per Pop", 
       title = "Norway & Finland Citizens Dominate", 
       subtitle = "Winter Olympics 1924 - 2014: Percentage of Medals Won per Population",
       caption = "Bigger Bubbles = More Medals Won Overall") +
  scale_size_area(max_size = 20) +
  geom_text(aes(label = Code), check_overlap = TRUE, size = 3) +
  theme_classic() +
  theme(axis.text = element_blank()) +
  theme(axis.ticks = element_blank()) +
  theme(axis.title.y = element_text(size = 12)) + 
  theme(axis.title.x = element_text(size = 12)) +
  theme(legend.position = 'none')
  

plotly1 <- ggplotly(p = norway2, tooltip = c("Country", "Total_Medals"))
plotly1
#api_create(plotly1, filename = "norway_pop-plotly")

7. Host Country Advantage

My second interactive plot (created with HighCharter) I modeled off of my first heat map host country visualization. The user can hover over each heatmap tile to see whether the country hosted the Olympics that particular year and how many medals the country won.

#library("viridis")
library(highcharter)
x <- c("Total Medals", "Host", "Year")
y <- sprintf("{point.%s}", c("total_medals", "hosted", "Year"))
tltip <- tooltip_table(x, y)


hosts2 <- hosts %>% 
  filter(Country_Full %in% Host_Country) 
hosts2$hosted <- ifelse(hosts2$hosted == 1, "Yes", "No")

hosthc <- hchart(hosts2, "heatmap", hcaes(x = Year, y = Country_Full, value = total_medals)) %>% 
  hc_colorAxis(stops = color_stops(10, (palette(10))),
               type = "logarithmic") %>% 
  hc_tooltip(useHTML = TRUE, pointFormat = tltip) %>% 
  hc_title(text = "Home Court Advantage") %>% 
  hc_legend(layout = "vertical", verticalAlign = "top",
            align = "right", valueDecimals = 0) %>% 
  hc_size(height = 800)
hosthc

8. Data Table

I chose to create a data table that provides the reader with information about each contending country in the Winter Olympics, including the country’s population, how many winter games the country has competed and medaled in, and the number of Gold, Silver and Bronze medals won. The small colored bars are supposed to serve as a visual representation of the proportion of medals that were Gold, Silver and Bronze.

library(DT)
descriptive_pop <- descriptive_gdp %>% 
  select(Country_Full, Population, num_olymp, Gold, Silver, Bronze, total_medals) 
descriptive_pop$total_medals <- as.integer(descriptive_pop$total_medals)

## Borrowed the sketch Code from: https://rstudio.github.io/DT/
sketch = htmltools::withTags(table(
  class = 'display',
  thead(
    tr(
      th(rowspan = 2, 'Country'),
      th(colspan = 2, 'Olympics'),
      th(colspan = 4, 'Medals')
    ),
    tr(
      lapply(c('Population', 'Number of Games', 'Gold', 'Silver', 'Bronze', 'Total'), th)
    )
  )
))

## Data Table 
options(DT.options = list(pageLength = 10, autoWidth = TRUE, columnDefs = list(list(width = '80px', targets = c(1, 2, 3, 4, 5, 6)))))
desc_dt <- descriptive_pop %>%
    datatable(rownames = FALSE, filter = list(position = "top"),
              colnames = c("Country" = "Country_Full", "# Winter Olympics Medaled" = "num_olymp", 
                           "# Gold Medals" = "Gold", "# Silver Medals" = "Silver",
                           "# Bronze Medals" = "Bronze",
                           "# Total Medals" = "total_medals"),
              width = 700, height = 600,
              options = list(language = list(sSearch = "Filter:")),
              
              container = sketch,
              caption = htmltools::tags$caption(
                # style borrowed from same R Studio Github above
              style = 'caption-side: bottom; text-align: center;', 
              'Table 1: ', htmltools::em('Winter Olympics 1924 - 2014.' ))) %>% 
    formatStyle('Country', fontWeight = 'bold') %>% 
    formatStyle(columns = c(2,3,4,5,6,7), fontSize = '80%') %>% 
    formatStyle('# Total Medals', fontWeight = 'bold', backgroundColor = "lightblue") %>% 
    formatStyle('# Gold Medals',
      background = styleColorBar(range(descriptive_pop$total_medals), 'gold'),
      backgroundSize = '90% 80%',
      backgroundRepeat = 'no-repeat',
      backgroundPosition = 'center') %>% 
    formatStyle('# Silver Medals',
      background = styleColorBar(range(descriptive_pop$total_medals), 'silver'),
      backgroundSize = '90% 80%',
      backgroundRepeat = 'no-repeat',
      backgroundPosition = 'center') %>% 
    formatStyle('# Bronze Medals',
      background = styleColorBar(range(descriptive_pop$total_medals), 'brown'),
      backgroundSize = '90% 80%',
      backgroundRepeat = 'no-repeat',
      backgroundPosition = 'center')

desc_dt